Programming Languages

tidytuesday
Author
Affiliation
Published

March 21, 2023

library(tidyverse)
── Attaching packages ────────────────────────────────── tidyverse 1.3.2.9000 ──
✔ ggplot2   3.4.1     ✔ dplyr     1.1.0
✔ tibble    3.2.0     ✔ stringr   1.5.0
✔ tidyr     1.3.0     ✔ forcats   0.5.1
✔ readr     2.1.2     ✔ lubridate 1.9.0
✔ purrr     1.0.1     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
library(skimr)
library(visdat)
# Read the TidyTuesday 2023-03-21 programming-languages table straight from
# the rfordatascience/tidytuesday GitHub repository (requires network access).
languages <- readr::read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2023/2023-03-21/languages.csv"
)
Rows: 4303 Columns: 49
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (21): pldb_id, title, description, type, creators, website, domain_name,...
dbl (24): appeared, domain_name_registered, isbndb, book_count, semantic_sch...
lgl  (4): features_has_comments, features_has_semantic_indentation, features...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Skim the Dataset

# Per-column overview (missingness, ranges, distributions) of the raw table.
languages |>
  skimr::skim()
Data summary
Name languages
Number of rows 4303
Number of columns 49
_______________________
Column type frequency:
character 21
logical 4
numeric 24
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
pldb_id 0 1.00 1 52 0 4303 0
title 0 1.00 1 56 0 4267 0
description 3480 0.19 4 2273 0 811 0
type 0 1.00 2 27 0 40 0
creators 3203 0.26 2 253 0 985 0
website 2928 0.32 13 131 0 1368 0
domain_name 3588 0.17 6 32 0 700 0
reference 2314 0.46 15 251 0 1955 0
github_repo 3402 0.21 25 73 0 897 0
github_repo_description 3438 0.20 4 419 0 853 0
github_language 3829 0.11 1 30 0 474 0
github_language_tm_scope 3837 0.11 4 34 0 361 0
github_language_type 3837 0.11 4 11 0 4 0
github_language_ace_mode 3838 0.11 1 16 0 96 0
github_language_file_extensions 3833 0.11 1 606 0 466 0
wikipedia 2731 0.37 32 104 0 1566 0
wikipedia_summary 2884 0.33 17 6741 0 1407 0
wikipedia_related 3145 0.27 1 1761 0 1059 0
line_comment_token 3831 0.11 1 7 0 23 0
origin_community 1190 0.72 3 305 0 2232 0
file_type 3213 0.25 2 6 0 4 0

Variable type: logical

skim_variable n_missing complete_rate mean count
features_has_comments 3683 0.14 1.00 TRU: 617, FAL: 3
features_has_semantic_indentation 3722 0.14 0.11 FAL: 516, TRU: 65
features_has_line_comments 3765 0.13 0.96 TRU: 517, FAL: 21
is_open_source 3792 0.12 0.89 TRU: 453, FAL: 58

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
appeared 0 1.00 1991.11 111.44 -2000 1984.00 1997.0 2012.00 2023 ▁▁▁▁▇
domain_name_registered 3801 0.12 2011.33 7.02 1990 2007.00 2013.0 2017.00 2023 ▁▃▃▇▆
isbndb 3217 0.25 7.71 33.16 0 0.00 0.0 2.00 400 ▇▁▁▁▁
book_count 0 1.00 2.08 17.34 0 0.00 0.0 0.00 401 ▇▁▁▁▁
semantic_scholar 3545 0.18 3.79 8.32 0 0.00 0.0 3.00 52 ▇▁▁▁▁
language_rank 0 1.00 2151.00 1242.31 0 1075.50 2151.0 3226.50 4302 ▇▇▇▇▇
github_repo_stars 3414 0.21 2127.40 7554.02 0 29.00 194.0 1071.00 88526 ▇▁▁▁▁
github_repo_forks 3417 0.21 261.29 1203.00 0 2.25 16.0 91.50 23732 ▇▁▁▁▁
github_repo_updated 3418 0.21 2021.39 1.76 2012 2022.00 2022.0 2022.00 2023 ▁▁▁▁▇
github_repo_subscribers 3418 0.21 62.34 200.88 0 4.00 13.0 44.00 2910 ▇▁▁▁▁
github_repo_created 3425 0.20 2015.84 3.48 2006 2013.00 2016.0 2019.00 2022 ▁▅▇▇▇
github_repo_issues 3518 0.18 123.03 546.26 0 1.00 9.0 61.00 9522 ▇▁▁▁▁
github_repo_first_commit 3567 0.17 2014.74 4.99 1987 2012.00 2015.0 2018.00 2022 ▁▁▁▆▇
github_language_repos 3833 0.11 197134.67 1226900.57 0 91.25 725.5 7900.25 16046489 ▇▁▁▁▁
wikipedia_daily_page_views 2837 0.34 227.13 783.55 -1 9.00 24.0 99.00 13394 ▇▁▁▁▁
wikipedia_backlinks_count 2877 0.33 318.55 1635.29 1 13.00 39.0 126.00 34348 ▇▁▁▁▁
wikipedia_page_id 2893 0.33 9167847.21 13506832.90 928 375153.75 2114700.5 12321223.00 63063548 ▇▁▁▁▁
wikipedia_appeared 2958 0.31 1991.14 17.03 1830 1980.00 1994.0 2005.00 2019 ▁▁▁▃▇
wikipedia_created 3040 0.29 2005.75 3.77 2001 2003.00 2005.0 2007.00 2020 ▇▇▂▁▁
wikipedia_revision_count 3130 0.27 330.43 813.26 1 35.00 84.0 242.00 10104 ▇▁▁▁▁
last_activity 0 1.00 2000.62 84.60 -900 1992.00 2006.0 2021.00 2023 ▁▁▁▁▇
number_of_users 0 1.00 13771.26 227712.95 0 0.00 20.0 230.00 7179119 ▇▁▁▁▁
number_of_jobs 0 1.00 422.18 12572.99 0 0.00 0.0 0.00 771996 ▇▁▁▁▁
central_package_repository_count 1482 0.66 0.00 0.00 0 0.00 0.0 0.00 0 ▁▁▇▁▁

Looking at Data Science Languages

# Titles of the languages treated as "data science" languages in this section.
# NOTE(review): the rendered table below has only four rows, so "Excel" does
# not match any `title` in the dataset -- confirm the spelling used there.
ds_langs <- c("R", "Julia", "Python", "SAS", "Excel")

# Keep only those languages, hide every wikipedia-related column, and render
# the remaining columns as a table.
languages |>
  filter(title %in% ds_langs) |>
  select(!contains("wikipedia")) |>
  knitr::kable()
pldb_id title description type appeared creators website domain_name domain_name_registered reference isbndb book_count semantic_scholar language_rank github_repo github_repo_stars github_repo_forks github_repo_updated github_repo_subscribers github_repo_created github_repo_description github_repo_issues github_repo_first_commit github_language github_language_tm_scope github_language_type github_language_ace_mode github_language_file_extensions github_language_repos features_has_comments features_has_semantic_indentation features_has_line_comments line_comment_token last_activity number_of_users number_of_jobs origin_community central_package_repository_count file_type is_open_source
python Python NA pl 1991 Guido van Rossum https://www.python.org/ python.org 1995 https://www.programiz.com/python-programming/keyword-list 339 342 52 3 NA NA NA NA NA NA NA NA NA Python source.python programming python py cgi fcgi gyp gypi lmi py3 pyde pyi pyp pyt pyw rpy smk spec tac wsgi xpy 9300725 TRUE TRUE TRUE # 2022 2818037 46976 Centrum Wiskunde & Informatica NA text NA
r R NA pl 1993 Ross Ihaka and Robert Gentleman https://www.r-project.org r-project.org 1999 NA 40 40 9 15 NA NA NA NA NA NA NA NA NA R source.r programming r r rd rsx 689533 TRUE FALSE TRUE # 2022 1075613 14173 University of Auckland NA text TRUE
sas SAS NA pl 1976 Anthony James Barr https://www.sas.com sas.com 1990 NA 94 96 10 35 NA NA NA NA NA NA NA NA NA SAS source.sas programming text sas 8407 TRUE FALSE TRUE * 2022 361103 4682 NA 0 text NA
julia Julia NA pl 2012 Jeff Bezanson and Alan Edelman and Stefan Karpinski and Viral B. Shah http://julialang.org/ julialang.org NA NA 22 22 35 34 https://github.com/JuliaLang/julia 41515 5100 2023 952 2011 The Julia Programming Language 4420 NA Julia source.julia programming julia jl 53507 TRUE FALSE TRUE # 2023 81911 85 https://github.com/JuliaLang NA text TRUE

Date Appeared vs. Number of Users

# Scatter plot of year of appearance against number of users, restricted to
# languages whose `appeared` value is greater than 1980.
# `title` is not a standard ggplot2 aesthetic; presumably it is mapped so the
# language name shows up in the plotly hover tooltip -- confirm.
lang_plot <- languages |>
  filter(appeared > 1980) |>
  ggplot(aes(title = title, x = appeared, y = number_of_users)) +
  geom_point() +
  labs(title = "Languages: Date Appeared vs Number of Users after 1980")

# Convert the static ggplot into an interactive plotly widget.
plotly::ggplotly(lang_plot)

Language Rank versus Number of Users

# Scatter plot of language rank against number of users for languages that
# appeared after 1980 and have `language_rank` below 60. A hand-picked set of
# data-science languages is flagged ("Y") so it can be coloured differently.
# `title` is not a standard ggplot2 aesthetic; presumably it is mapped so the
# language name shows up in the plotly hover tooltip -- confirm.
lang_plot <- languages |>
  filter(appeared > 1980, language_rank < 60) |>
  mutate(
    ds_language = if_else(
      title %in% c("R", "MATLAB", "SAS", "Julia", "Python"),
      "Y",
      "N"
    )
  ) |>
  ggplot(
    aes(
      title = title,
      x = language_rank,
      y = number_of_users,
      color = ds_language
    )
  ) +
  geom_point() +
  scale_color_manual(values = c("Y" = "blue", "N" = "grey")) +
  labs(title = "Languages: Rank vs Number of Users after 1980")

# Convert the static ggplot into an interactive plotly widget.
plotly::ggplotly(lang_plot)

Citation

BibTeX citation:
@online{laderas2023,
  author = {Ted Laderas},
  title = {Programming {Languages}},
  date = {2023-03-21},
  langid = {en}
}
For attribution, please cite this work as:
Ted Laderas. 2023. “Programming Languages.” March 21, 2023.